In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
import time
from collections import Counter
# Switch for verbose output; no cell shown here reads it.
do_prints = False
# True  -> load the "*_STI" pickles (cleaned_courses_STI / cleaned_enrol_STI).
# False -> load the full cleaned_courses / cleaned_enrol pickles.
use_STI = True
Get the list of courses in the correct order
In [2]:
# Load the course table matching the `use_STI` switch and keep only its index
# labels, as a plain list in file order. These labels are later used as both
# the row and column labels of the matrix.
# Only load pickles from a trusted source: unpickling can run arbitrary code.
courses = pd.read_pickle(
    "../data/cleaned_courses_STI.pickle" if use_STI else "../data/cleaned_courses.pickle"
).index.tolist()
In [3]:
# Square course x course matrix, initialised to zero.
# Built directly as a float frame: an empty (all-NaN, object dtype) frame followed
# by `.fillna(0)` depends on pandas silently downcasting object columns, which
# recent pandas versions deprecate. The matrix ends up holding fractions anyway,
# so starting as float also avoids writing floats into integer columns later.
probs = pd.DataFrame(0.0, index=courses, columns=courses)
probs.head()
Out[3]:
In [4]:
# Load the enrolment table matching the `use_STI` switch. The cells below read
# its 'PersonID' and 'CourseCodes' columns.
# Only load pickles from a trusted source: unpickling can run arbitrary code.
enrol = pd.read_pickle(
    "../data/cleaned_enrol_STI.pickle" if use_STI else "../data/cleaned_enrol.pickle"
)
enrol.head()
Out[4]:
New probabilities: co-enrolment counts between courses, normalised per course
In [5]:
# Distinct student identifiers, in order of first appearance in `enrol`.
Students = enrol.loc[:, "PersonID"].unique()
In [6]:
# For every student, the list of course codes they are enrolled in (one entry per
# enrolment row, in row order). Selecting the column before aggregating avoids
# `DataFrameGroupBy.apply` with a Python lambda over each sub-frame, which is slow
# and emits a deprecation warning on recent pandas because the applied frame
# still contains the grouping column.
# Despite its name, `students_courses_df` is a Series indexed by PersonID.
students_courses_df = enrol.groupby('PersonID')['CourseCodes'].apply(list)
# Plain dict {PersonID: [course codes]} for fast per-student lookup.
students_courses_dico = students_courses_df.to_dict()
In [7]:
import itertools
In [8]:
# Allocated but not read by any cell shown here; kept in case other code relies on it.
weights_wt_students = np.zeros((len(courses), len(courses)))  # weight matrix 1
w1 = 1  # weight added to a pair of courses for each student enrolled in both

# Map each course label to its row/column position so the counts can be accumulated
# in a plain NumPy array: updating `probs.loc[a, b]` one cell at a time performs a
# label lookup on every increment and is very slow in this hot loop.
# Assumes the labels in `probs.index` are unique and match `probs.columns` -- TODO confirm.
course_position = {course: pos for pos, course in enumerate(probs.index)}
# Start from zero so that re-running this cell does not add the counts a second time.
pair_counts = np.zeros((len(probs.index), len(probs.columns)))

for person in tqdm(Students):  # for each student ...
    # Drop repeated enrolments (students retaking a course) while keeping order.
    # Skipping only the (course, course) pair is not enough: a course listed twice
    # would still contribute twice to its pairs with every other course, which can
    # push the normalised value above 1.
    courses_taken = dict.fromkeys(students_courses_dico[person])
    # A course code missing from `probs` raises KeyError here.
    positions = [course_position[course] for course in courses_taken]
    for pos1, pos2 in itertools.combinations(positions, 2):
        pair_counts[pos1, pos2] += w1  # symmetric: add the weight in both directions
        pair_counts[pos2, pos1] += w1

# Same labels as before, now holding the co-enrolment counts (as floats).
probs = pd.DataFrame(pair_counts, index=probs.index, columns=probs.columns)
In [9]:
# Enrolment rows per course (rows with a non-null PersonID), counted in a single pass
# instead of re-scanning `enrol` once for every course.
# NOTE(review): this counts rows, so a student enrolled twice in the same course is
# counted twice in the denominator -- confirm whether unique students are intended.
enrolments_per_course = enrol.loc[enrol["PersonID"].notna(), "CourseCodes"].value_counts()
# Courses without any enrolment row get 0, exactly as a per-course count would give.
row_totals = enrolments_per_course.reindex(probs.index, fill_value=0).to_numpy(dtype=float)

# Divide each row by the enrolment count of its course. A zero count yields NaN/inf
# (silenced here), the same values a row-by-row pandas division produces.
with np.errstate(divide="ignore", invalid="ignore"):
    normalised = probs.to_numpy(dtype=float) / row_totals[:, np.newaxis]
# Diagonal entries (a course paired with itself) are set to 1.
# Assumes `probs` rows and columns share the same order, one row per course -- TODO confirm.
np.fill_diagonal(normalised, 1.)

# NOTE: not idempotent -- re-running this cell divides the already-normalised values again.
probs = pd.DataFrame(normalised, index=probs.index, columns=probs.columns)
In [10]:
# Preview the first rows of the normalised matrix.
probs.head()
Out[10]:
In [12]:
# Create the output folder first: `to_pickle` does not create missing directories
# and fails if "Graphs" is absent (e.g. on a fresh checkout).
# NOTE(review): the file name does not depend on `use_STI`, so the STI and full
# variants overwrite each other -- confirm this is intended.
output_dir = "Graphs"
os.makedirs(output_dir, exist_ok=True)
probs.to_pickle(os.path.join(output_dir, "probs.pkl"))
In [ ]: